05_analysis_1.qmd

Author

group_12

#| echo: true
#| message: false
#| warning: false
library("tidyverse")
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggplot2")
library("viridis")
Loading required package: viridisLite

Loading the final cleaned and augmented dataset:

# Read the final cleaned and augmented outbreak dataset (tab-separated file).
final_outbreaks <- read_tsv(file = "../Data/final_outbreaks.tsv")
Rows: 1856 Columns: 26
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (21): Status, iso3, Continent, Country, iso2, icd10n, icd103n, icd104n, ...
dbl  (5): Index, Year, icd11c1, Latitude, Longitude

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Reshaping data set to wide format and adding frequency column (Freq):

# Count outbreak records per country and disease code (icd104c), spread the
# disease codes into one column each (0 where a country has no record of that
# code), and add Freq = total number of records for the country.
geo_outbreaks_wide <- final_outbreaks |>
  count(Country, iso3, icd104c, Continent, Longitude, Latitude) |>
  pivot_wider(
    id_cols = c(Country, iso3, Continent, Longitude, Latitude),
    names_from = icd104c,
    values_from = n,
    values_fill = 0
  ) |>
  # pick() selects from the data frame currently flowing through the pipe, so
  # Freq does not depend on re-reading a global object by name.
  # Latitude/Longitude are numeric but are coordinates, not counts, so they
  # are excluded from the row total.
  mutate(Freq = rowSums(pick(where(is.numeric), -Latitude, -Longitude)))

Bar plot of the 20 countries with the highest number of outbreaks (1996 - 2022):

# Names of the 20 countries with the largest total outbreak count (Freq).
# Only used for membership filtering below, so no factor ordering is needed.
top_countries <- geo_outbreaks_wide |>
  arrange(desc(Freq)) |>
  slice_head(n = 20) |>
  pull(Country)

# Bar chart of those countries, tallest bar first, filled by continent.
fig4a_reproduced <- geo_outbreaks_wide |>
  filter(Country %in% top_countries) |>
  ggplot(aes(
    x = fct_reorder(Country, -Freq), # order bars by descending Freq
    y = Freq,
    fill = Continent
  )) +
  geom_col() +
  labs(
    y = "Total frequency of outbreaks (1996 - 2022)",
    x = "Country",
    # Explicit "\n" instead of a literal line break inside the string: the
    # source-code indentation no longer leaks into the centred title.
    title = "Top 20 countries with the highest\nnumber of outbreaks"
  ) +
  theme_minimal() +
  scale_fill_viridis_d(alpha = 1) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 8),
    plot.title = element_text(hjust = 0.5)
  )

fig4a_reproduced

# Write the figure to disk. `plot` is named explicitly because ggsave()'s first
# positional argument is `filename`. The size is pinned to the 7 x 5 in that
# the original render reported, so the file no longer depends on the size of
# whatever graphics device happens to be active.
ggsave(
  filename = "../Results/fig4a_reproduced.png",
  plot = fig4a_reproduced,
  width = 7,
  height = 5,
  units = "in",
  dpi = 300
)
Saving 7 x 5 in image

Map plot with the spatial frequency of outbreaks

# World outline as a data frame of polygon vertices. map_data() converts data
# from the `maps` package (which must be installed); nothing is downloaded.
world <- map_data("world")

# World map with one point per row of geo_outbreaks_wide, coloured by the
# total outbreak count (Freq).
fig3_reproduced <- ggplot() +
  geom_polygon(
    data = world,
    aes(x = long, y = lat, group = group),
    fill = "white",
    color = "black"
  ) +
  geom_point(
    data = geo_outbreaks_wide,
    aes(x = Longitude, y = Latitude, color = Freq),
    size = 3
  ) +
  # direction = -1 reverses the viridis palette.
  scale_color_viridis(name = "Frequency", direction = -1) +
  # Keep an approximately correct longitude/latitude aspect ratio so the map
  # is not stretched to fill the output device.
  coord_quickmap() +
  labs(
    title = "Disease Frequency by Country",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

fig3_reproduced

# Write the map to disk. `plot` is named explicitly because ggsave()'s first
# positional argument is `filename`. The size is pinned to the 7 x 5 in that
# the original render reported, so the file no longer depends on the size of
# whatever graphics device happens to be active.
ggsave(
  filename = "../Results/fig3_reproduced.png",
  plot = fig3_reproduced,
  width = 7,
  height = 5,
  units = "in",
  dpi = 300
)
Saving 7 x 5 in image

Reshaping data set to obtain the total yearly outbreaks:

# Count outbreak records per year and disease code (icd104c), spread the
# disease codes into one column each (0 where a year has no record of that
# code), and add Freq = total number of records in the year.
outbreaks_yearly <- final_outbreaks |>
  count(Year, icd104c) |>
  pivot_wider(
    names_from = icd104c,
    values_from = n,
    values_fill = 0
  ) |>
  # pick() selects from the data frame currently flowing through the pipe, so
  # Freq does not depend on re-reading a global object by name.
  # Year is numeric but is the row key, not a count, so it is excluded.
  mutate(Freq = rowSums(pick(where(is.numeric), -Year)))

Barplot of the yearly number of outbreaks:

# Bar chart of the total number of outbreak records per year.
# NOTE(review): the original render warned that one row was removed for
# missing values; presumably that is a row with no Year -- confirm against the
# data. It is dropped explicitly here instead of being discarded by ggplot2
# with a warning.
fig4c_reproduced <- outbreaks_yearly |>
  filter(!is.na(Year)) |>
  ggplot(aes(x = Year, y = Freq)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Yearly Outbreaks (1996-2022)",
    x = "Year",
    y = "Total Outbreaks"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    plot.title = element_text(hjust = 0.5)
  )

fig4c_reproduced
Warning: Removed 1 rows containing missing values (`position_stack()`).

# Write the figure to disk. `plot` is named explicitly because ggsave()'s first
# positional argument is `filename`. The size is pinned to the 7 x 5 in that
# the original render reported, so the file no longer depends on the size of
# whatever graphics device happens to be active.
ggsave(
  filename = "../Results/fig4c_reproduced.png",
  plot = fig4c_reproduced,
  width = 7,
  height = 5,
  units = "in",
  dpi = 300
)
Saving 7 x 5 in image
Warning: Removed 1 rows containing missing values (`position_stack()`).